STEP 1: Import Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy import stats
import folium
from folium.plugins import HeatMap
from IPython.display import display, clear_output
import ipywidgets as widgets
import networkx as nx
from geopy.distance import geodesic
import random
import time
import zipfile
C:\Users\teste\anaconda3\Lib\site-packages\pandas\core\arrays\masked.py:61: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

STEP 2: Load Dataset¶

In [2]:
df = pd.read_csv(r"C:\Users\teste\Downloads\smart_traffic_management_dataset.csv")
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   timestamp             2000 non-null   object 
 1   location_id           2000 non-null   int64  
 2   traffic_volume        2000 non-null   int64  
 3   avg_vehicle_speed     2000 non-null   float64
 4   vehicle_count_cars    2000 non-null   int64  
 5   vehicle_count_trucks  2000 non-null   int64  
 6   vehicle_count_bikes   2000 non-null   int64  
 7   weather_condition     2000 non-null   object 
 8   temperature           2000 non-null   float64
 9   humidity              2000 non-null   float64
 10  accident_reported     2000 non-null   int64  
 11  signal_status         2000 non-null   object 
dtypes: float64(3), int64(6), object(3)
memory usage: 187.6+ KB
In [4]:
df.head()
Out[4]:
timestamp location_id traffic_volume avg_vehicle_speed vehicle_count_cars vehicle_count_trucks vehicle_count_bikes weather_condition temperature humidity accident_reported signal_status
0 2024-01-01 00:00:00 4 504 53.124162 142 24 44 Cloudy 33.334387 36.390698 0 Red
1 2024-01-01 00:01:00 5 209 44.947850 862 50 23 Cloudy 17.926830 37.640927 0 Green
2 2024-01-01 00:02:00 3 572 63.179229 317 12 10 Windy 33.483375 84.262610 1 Red
3 2024-01-01 00:03:00 5 699 42.269697 709 43 21 Sunny 19.212941 61.550978 0 Yellow
4 2024-01-01 00:04:00 5 639 72.185791 594 34 14 Cloudy 11.349244 77.494506 0 Red
In [5]:
df.describe()
Out[5]:
location_id traffic_volume avg_vehicle_speed vehicle_count_cars vehicle_count_trucks vehicle_count_bikes temperature humidity accident_reported
count 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000
mean 2.991500 540.959000 50.009035 449.872500 49.269000 24.117000 22.447538 60.105652 0.054000
std 1.430892 271.933985 17.267898 253.695741 28.830196 14.297011 7.200535 17.245294 0.226074
min 1.000000 50.000000 20.011192 20.000000 0.000000 0.000000 10.025557 30.007119 0.000000
25% 2.000000 309.000000 35.168589 227.750000 24.000000 12.000000 16.410169 45.179396 0.000000
50% 3.000000 549.000000 50.412652 452.000000 49.000000 24.000000 22.364844 60.450407 0.000000
75% 4.000000 774.000000 65.135366 660.250000 74.000000 36.000000 28.563564 74.805580 0.000000
max 5.000000 998.000000 79.972635 899.000000 99.000000 49.000000 34.990891 89.989003 1.000000

STEP 3: Preprocessing¶

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
label_encoder = LabelEncoder()
df['weather_condition'] = label_encoder.fit_transform(df['weather_condition'])
df['accident_reported'] = df['accident_reported'].map({'Yes':1, 'No':0})
df['signal_status'] = df['signal_status'].map({'Green':0,'Yellow':1,'Red':2})
df.fillna(df.mean(), inplace=True)

numeric_features = ['traffic_volume','avg_vehicle_speed','vehicle_count_cars',
                    'vehicle_count_trucks','vehicle_count_bikes','temperature',
                    'humidity','accident_reported','signal_status']

STEP 4: EDA (Exploratory Data Analysis)¶

In [7]:
def eda_overview():
    print("Dataset Shape:", df.shape)
    print("\nDataset Info:")
    print(df.info())
    print("\nDataset Description:")
    display(df.describe())
    print("\nMissing Values per Column:")
    print(df.isnull().sum())
    print("\nDistribution of Signal Status:")
    sns.countplot(x='signal_status', data=df)
    plt.show()
    
eda_overview()
Dataset Shape: (2000, 12)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   timestamp             2000 non-null   datetime64[ns]
 1   location_id           2000 non-null   int64         
 2   traffic_volume        2000 non-null   int64         
 3   avg_vehicle_speed     2000 non-null   float64       
 4   vehicle_count_cars    2000 non-null   int64         
 5   vehicle_count_trucks  2000 non-null   int64         
 6   vehicle_count_bikes   2000 non-null   int64         
 7   weather_condition     2000 non-null   int32         
 8   temperature           2000 non-null   float64       
 9   humidity              2000 non-null   float64       
 10  accident_reported     0 non-null      float64       
 11  signal_status         2000 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int32(1), int64(6)
memory usage: 179.8 KB
None

Dataset Description:
timestamp location_id traffic_volume avg_vehicle_speed vehicle_count_cars vehicle_count_trucks vehicle_count_bikes weather_condition temperature humidity accident_reported signal_status
count 2000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 2000.000000 0.0 2000.00000
mean 2024-01-01 16:39:29.999999744 2.991500 540.959000 50.009035 449.872500 49.269000 24.117000 2.016500 22.447538 60.105652 NaN 1.03700
min 2024-01-01 00:00:00 1.000000 50.000000 20.011192 20.000000 0.000000 0.000000 0.000000 10.025557 30.007119 NaN 0.00000
25% 2024-01-01 08:19:45 2.000000 309.000000 35.168589 227.750000 24.000000 12.000000 1.000000 16.410169 45.179396 NaN 0.00000
50% 2024-01-01 16:39:30 3.000000 549.000000 50.412652 452.000000 49.000000 24.000000 2.000000 22.364844 60.450407 NaN 1.00000
75% 2024-01-02 00:59:15 4.000000 774.000000 65.135366 660.250000 74.000000 36.000000 3.000000 28.563564 74.805580 NaN 2.00000
max 2024-01-02 09:19:00 5.000000 998.000000 79.972635 899.000000 99.000000 49.000000 4.000000 34.990891 89.989003 NaN 2.00000
std NaN 1.430892 271.933985 17.267898 253.695741 28.830196 14.297011 1.449234 7.200535 17.245294 NaN 0.80309
Missing Values per Column:
timestamp                  0
location_id                0
traffic_volume             0
avg_vehicle_speed          0
vehicle_count_cars         0
vehicle_count_trucks       0
vehicle_count_bikes        0
weather_condition          0
temperature                0
humidity                   0
accident_reported       2000
signal_status              0
dtype: int64

Distribution of Signal Status:

STEP 5: UNIVARIATE ANALYSIS¶

In [8]:
univariate_dropdown = widgets.Dropdown(
    options=numeric_features,
    description='Feature:',
    value='traffic_volume'
)

def plot_univariate(feature):
    plt.figure(figsize=(6,4))
    sns.histplot(df[feature], kde=True, bins=20)
    plt.title(f'Univariate Analysis: {feature}')
    plt.show()
    
    plt.figure(figsize=(6,4))
    sns.boxplot(x=df[feature])
    plt.title(f'Boxplot: {feature}')
    plt.show()

display(widgets.interactive(plot_univariate, feature=univariate_dropdown))
interactive(children=(Dropdown(description='Feature:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicl…

STEP 6: BIVARIATE ANALYSIS¶

In [9]:
x_dropdown = widgets.Dropdown(options=numeric_features, description='X-axis:', value='traffic_volume')
y_dropdown = widgets.Dropdown(options=numeric_features, description='Y-axis:', value='avg_vehicle_speed')

def plot_bivariate(x_feature, y_feature):
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=df[x_feature], y=df[y_feature], hue=df['signal_status'], palette='coolwarm')
    plt.title(f'Bivariate Analysis: {x_feature} vs {y_feature}')
    plt.show()

display(widgets.interactive(plot_bivariate, x_feature=x_dropdown, y_feature=y_dropdown))
interactive(children=(Dropdown(description='X-axis:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicle…

STEP 7: MULTIVARIATE CORRELATION¶

In [10]:
def plot_correlation():
    plt.figure(figsize=(8,6))
    sns.heatmap(df[numeric_features].corr(), annot=True, cmap='coolwarm')
    plt.title("Multivariate Correlation Heatmap")
    plt.show()
plot_correlation()

STEP 8: OUTLIER DETECTION¶

In [11]:
z_scores = np.abs(stats.zscore(df[numeric_features]))
outliers = (z_scores > 3)
df_outliers = df[(outliers).any(axis=1)]

outlier_dropdown = widgets.Dropdown(options=numeric_features, description='Feature:')
def show_outliers(feature):
    outlier_values = df_outliers[feature]
    print(f"Number of outliers in {feature}: {len(outlier_values)}")
    display(outlier_values)

display(widgets.interactive(show_outliers, feature=outlier_dropdown))
interactive(children=(Dropdown(description='Feature:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicl…
In [19]:
numeric_cols = ['traffic_volume', 'avg_vehicle_speed', 
                'vehicle_count_cars', 'vehicle_count_trucks', 
                'vehicle_count_bikes', 'temperature', 'humidity']

# Create a boxplot for all numeric columns
plt.figure(figsize=(12,6))
sns.boxplot(data=df[numeric_cols])
plt.title("Outlier Detection in Traffic Dataset")
plt.xticks(rotation=45)
plt.show()

STEP 9: TRAFFIC HOTSPOT MAP¶

In [12]:
def plot_hotspot(min_signal_status):
    m = folium.Map(location=[df['location_id'].mean(), df['location_id'].mean()], zoom_start=12)
    heat_data = [[row['location_id'], row['location_id']] for idx, row in df.iterrows() if row['signal_status']>=min_signal_status]
    HeatMap(heat_data).add_to(m)
    display(m)

hotspot_slider = widgets.IntSlider(min=0, max=2, step=1, value=2, description='Min Signal Status:')
display(widgets.interactive(plot_hotspot, min_signal_status=hotspot_slider))
interactive(children=(IntSlider(value=2, description='Min Signal Status:', max=2), Output()), _dom_classes=('w…

STEP 10: PREDICTIVE MODELING¶

In [13]:
features = ['traffic_volume','avg_vehicle_speed','vehicle_count_cars','vehicle_count_trucks',
            'vehicle_count_bikes','temperature','humidity','accident_reported']
target = 'signal_status'

X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

def predict_signal_status(traffic_volume, avg_vehicle_speed, vehicle_count_cars, vehicle_count_trucks,
                          vehicle_count_bikes, temperature, humidity, accident_reported):
    accident_val = 1 if accident_reported=='Yes' else 0
    new_data = pd.DataFrame({'traffic_volume':[traffic_volume],
                             'avg_vehicle_speed':[avg_vehicle_speed],
                             'vehicle_count_cars':[vehicle_count_cars],
                             'vehicle_count_trucks':[vehicle_count_trucks],
                             'vehicle_count_bikes':[vehicle_count_bikes],
                             'temperature':[temperature],
                             'humidity':[humidity],
                             'accident_reported':[accident_val]})
    pred = model.predict(new_data)[0]
    mapping = {0:'Green', 1:'Yellow', 2:'Red'}
    print(f"Predicted Signal Status: {mapping[pred]}")

display(widgets.interactive(predict_signal_status,
                             traffic_volume=widgets.IntSlider(min=0, max=1000, value=100, step=1),
                             avg_vehicle_speed=widgets.IntSlider(min=0,max=120,value=30,step=1),
                             vehicle_count_cars=widgets.IntSlider(min=0,max=500,value=50,step=1),
                             vehicle_count_trucks=widgets.IntSlider(min=0,max=200,value=10,step=1),
                             vehicle_count_bikes=widgets.IntSlider(min=0,max=300,value=20,step=1),
                             temperature=widgets.IntSlider(min=-10,max=50,value=25,step=1),
                             humidity=widgets.IntSlider(min=0,max=100,value=50,step=1),
                             accident_reported=widgets.Dropdown(options=['Yes','No'],description='Accident')
                            ))
interactive(children=(IntSlider(value=100, description='traffic_volume', max=1000), IntSlider(value=30, descri…

STEP 11: SIMPLE ROUTE OPTIMIZATION¶

In [14]:
unique_locations = df['location_id'].unique()
location_coords = {}
np.random.seed(42)
for loc in unique_locations:
    # Random coordinates for simulation
    location_coords[loc] = (12.9 + np.random.rand()/10, 77.5 + np.random.rand()/10)

df['Latitude'] = df['location_id'].map(lambda x: location_coords[x][0])
df['Longitude'] = df['location_id'].map(lambda x: location_coords[x][1])

def simple_route(start_loc, end_loc, min_signal_status=2):
    start_lat, start_lon = location_coords[start_loc]
    end_lat, end_lon = location_coords[end_loc]
    
    m = folium.Map(location=[(start_lat+end_lat)/2, (start_lon+end_lon)/2], zoom_start=13)
    
    # Mark start and end
    folium.Marker([start_lat, start_lon], tooltip="Start", icon=folium.Icon(color='green')).add_to(m)
    folium.Marker([end_lat, end_lon], tooltip="End", icon=folium.Icon(color='red')).add_to(m)
    
    # Hotspots
    hotspot_coords = df[df['signal_status']>=min_signal_status][['Latitude','Longitude']].values
    for coord in hotspot_coords:
        folium.CircleMarker(location=coord, radius=5, color='orange', fill=True, fill_opacity=0.7).add_to(m)
    
    # Simple straight-line route avoiding hotspots
    route_points = [[start_lat, start_lon]]
    steps = 10
    lat_step = (end_lat - start_lat)/steps
    lon_step = (end_lon - start_lon)/steps
    current_point = [start_lat, start_lon]
    
    for i in range(1, steps+1):
        next_point = [current_point[0]+lat_step, current_point[1]+lon_step]
        for h in hotspot_coords:
            if geodesic(next_point,h).km < 0.5:
                next_point[0] += 0.001
                next_point[1] += 0.001
        route_points.append(next_point)
        current_point = next_point
    
    folium.PolyLine(route_points, color='blue', weight=4, opacity=0.7, tooltip='Route').add_to(m)
    display(m)

display(widgets.interactive(simple_route,
                             start_loc=widgets.Dropdown(options=unique_locations, description='Start Location'),
                             end_loc=widgets.Dropdown(options=unique_locations, description='End Location'),
                             min_signal_status=widgets.IntSlider(min=0,max=2,step=1,value=2)
                            ))
interactive(children=(Dropdown(description='Start Location', options=(4, 5, 3, 2, 1), value=4), Dropdown(descr…

STEP 12: SHORTEST PATH ROUTE OPTIMIZATION (Dijkstra)¶

In [15]:
G = nx.Graph()
for idx, row in df.iterrows():
    G.add_node(row['location_id'], pos=(row['Latitude'], row['Longitude']))

# Connect locations within threshold distance
threshold_km = 1  # 1 km radius
for i in unique_locations:
    for j in unique_locations:
        if i==j: continue
        coord_i = location_coords[i]
        coord_j = location_coords[j]
        distance = geodesic(coord_i, coord_j).km
        if distance <= threshold_km:
            # Weight includes congestion penalty
            sig_i = df[df['location_id']==i]['signal_status'].mean()
            sig_j = df[df['location_id']==j]['signal_status'].mean()
            weight = distance*(1 + sig_i + sig_j)
            G.add_edge(i,j,weight=weight)

def optimized_shortest_path(start_loc, end_loc):
    path = nx.dijkstra_path(G, start_loc, end_loc, weight='weight')
    route_coords = [location_coords[p] for p in path]
    
    m = folium.Map(location=[np.mean([c[0] for c in route_coords]), np.mean([c[1] for c in route_coords])], zoom_start=13)
    folium.PolyLine(route_coords, color='blue', weight=4, opacity=0.7, tooltip='Optimized Route').add_to(m)
    
    folium.Marker(location_coords[start_loc], tooltip='Start', icon=folium.Icon(color='green')).add_to(m)
    folium.Marker(location_coords[end_loc], tooltip='End', icon=folium.Icon(color='red')).add_to(m)
    
    hotspot_coords = df[df['signal_status']>=2][['Latitude','Longitude']].values
    for coord in hotspot_coords:
        folium.CircleMarker(location=coord,radius=5,color='orange',fill=True,fill_opacity=0.7,tooltip='Hotspot').add_to(m)
    
    display(m)

display(widgets.interactive(optimized_shortest_path,
                             start_loc=widgets.Dropdown(options=unique_locations, description='Start Location'),
                             end_loc=widgets.Dropdown(options=unique_locations, description='End Location')
                            ))
interactive(children=(Dropdown(description='Start Location', options=(4, 5, 3, 2, 1), value=4), Dropdown(descr…

STEP 13: REAL-TIME TRAFFIC SIMULATION¶

In [18]:
from folium.plugins import TimestampedGeoJson
def simulate_real_time_animated(df, steps=10):
    # Create a list of GeoJSON features
    features = []

    df_sim = df.copy()

    for step in range(steps):
        # Randomly update traffic_volume and signal_status
        df_sim['traffic_volume'] = df_sim['traffic_volume'].apply(lambda x: max(0, x + random.randint(-10,10)))
        df_sim['signal_status'] = df_sim['traffic_volume'].apply(lambda x: 2 if x>500 else (1 if x>250 else 0))

        timestamp = pd.Timestamp.now() + pd.Timedelta(minutes=step*5)

        for idx, row in df_sim.iterrows():
            if row['signal_status'] >= 1:  # Only show moderate/high congestion
                feature = {
                    'type': 'Feature',
                    'geometry': {
                        'type': 'Point',
                        'coordinates': [row['Longitude'], row['Latitude']]
                    },
                    'properties': {
                        'time': timestamp.isoformat(),
                        'style': {'color': 'red' if row['signal_status']==2 else 'orange'},
                        'icon': 'circle',
                        'iconstyle':{
                            'fillColor': 'red' if row['signal_status']==2 else 'orange',
                            'fillOpacity': 0.7,
                            'radius': 6
                        }
                    }
                }
                features.append(feature)

    # Create a base map
    m = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=13)

    TimestampedGeoJson({
        'type': 'FeatureCollection',
        'features': features,
    }, period='PT5M', add_last_point=True, auto_play=True, loop=False, max_speed=1).add_to(m)

    display(m)

# Run simulation
simulate_real_time_animated(df, steps=10)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: